In [ ]:
from planet4.dbscan import DBScanner

from planet4 import io, clustering, plotting, markings, dbscan
import seaborn as sns
sns.set_context('notebook')
blotchcols = markings.Blotch.to_average
fancols = markings.Fan.to_average

In [ ]:
# not automatically initialized
%matplotlib ipympl

In [ ]:
import socket
if socket.gethostname().startswith('macd2860'):
    %config InlineBackend.figure_format = 'retina'

In [ ]:
%config InlineBackend.figure_format = 'png'

In [ ]:
from nbtools.logging import setup_live_logging
import logging
logger = setup_live_logging('planet4.dbscan', logging.DEBUG)

Setup ids


In [ ]:
def get_gold_ids(person, path=None):
    """Read the IDs listed in one of the gold-data text files.

    Parameters
    ----------
    person : {"GP", "MES", "KMA", "common_gold_data"}
        Stem of the ``<person>.txt`` file to read.
    path : str or pathlib.Path, optional
        Folder that holds the gold-data text files. Defaults to the
        Dropbox folder this notebook was originally written against.

    Returns
    -------
    pd.Series
        The content of the header-less file; a Series as long as the file
        contains exactly one column.
    """
    if path is None:
        path = "/Users/klay6683/Dropbox/Documents/latex_docs/p4_paper1/gold_data"
    # `read_csv(..., squeeze=True)` was removed in pandas 2.0; squeezing the
    # column axis of the parsed frame gives the same result on old and new pandas.
    frame = pd.read_csv(Path(path) / f"{person}.txt", header=None)
    return frame.squeeze("columns")

In [ ]:
ids = get_gold_ids('common_gold_data')

In [ ]:
ids = 'br5 bu5 ek1 pbr 1dt 1dr 1fe dch bvc 1c5 1ab 1dk 18s 1b0 1cl 1ct 1at 1al 1aa 10p 185 139 13t 15k 17a'.split()

In [ ]:
def create_and_save_randoms():
    """Draw 100 random entries from the notebook-level ``ids`` and save them.

    The drawn array is written to 'myids.npy' in the working directory;
    nothing is returned.
    """
    drawn = np.random.choice(ids, 100)
    np.save('myids.npy', drawn)

myids = np.load('myids.npy')

len(myids)

In [ ]:
combined = list(ids) + list(myids)

In [ ]:
%store combined

In [ ]:
db = DBScanner(savedir='gold_with_angle_std', do_large_run=True)

In [ ]:
for id_ in ids:
    print(id_)
    db.cluster_image_id(id_)

In [ ]:
# Gather the `angle_std` values of the fan results for every image id in `ids`
# into one flat list.
bucket = []
for img_id in ids:
    p4id = markings.ImageID(img_id, scope='planet4', data=db.data)
    # Re-point `db.pm` at the current image before reading `fandf`
    # (presumably `pm` is a path manager resolving the result files — confirm).
    db.pm.obsid = p4id.image_name
    db.pm.id = img_id
    try:
        bucket.extend(db.pm.fandf.angle_std.values)
    except FileNotFoundError:
        # Best effort: image ids without a readable fan result are skipped.
        continue

In [ ]:
len(bucket)

In [ ]:
bucket = np.array(bucket)

In [ ]:
import seaborn as sns

In [ ]:
sns.set_context('paper')

In [ ]:
bins = np.arange(0, 22, 1)

In [ ]:
pd.Series(bucket).to_csv("angle_std_bucket.csv", index=False)

In [ ]:
# Histogram of the collected `bucket` values using the `bins` edges defined above.
fig, ax = plt.subplots(constrained_layout=True)
# No `ax` is passed, so seaborn draws on the current axes, i.e. the one just created.
# NOTE(review): `distplot` is deprecated in newer seaborn releases (histplot is the
# suggested replacement) — confirm against the installed version.
sns.distplot(bucket, kde=False, bins=bins)
ax.set_title("Histogram of angular STD for merged fan clusters")
ax.set_xlabel("Fan angle standard deviation per cluster [deg]")
ax.set_ylabel("Histogram Counts")

In [ ]:
db.pm.fanfile

In [ ]:
db.pm.fandf.angle_std

In [ ]:
np.save('combined_ids_to_check.npy', np.array(combined))

In [ ]:
from nbtools import execute_in_parallel

In [ ]:
def process_id(id_):
    """Run the 'large' parameter scan for fans and blotches of one image id.

    Parameters
    ----------
    id_ : str
        Image id handed to ``DBScanner.parameter_scan``.
    """
    # Imported locally — presumably so the function is self-contained when it is
    # shipped to parallel workers via `execute_in_parallel` (confirm).
    from planet4.dbscan import DBScanner

    scanner = DBScanner(savedir='newest_clustering_review', do_large_run=True)
    for marking_kind in ('fan', 'blotch'):
        scanner.parameter_scan(
            id_,
            marking_kind,
            msf_vals_to_scan=[0.1, 0.13],
            eps_vals_to_scan=[20, 25, 30],
            size_to_scan='large',
        )

Here are my comments from the review:

APF0000br5 - seems like the big blotch should have been seen

APF0000bu5 - seems like the middle fan should be there - seems to be too strict a cut, not a clustering issue?

APF0000ek1 - yellow final blotch comes out of nowhere

APF0000pbr - bottom right blotch seems like it should have survived

APF00001dt - cyan fan seems bigger than it should be


In [ ]:
results = execute_in_parallel(process_id, combined)

In [ ]:
# Cluster and plot the blotches of every image id in `ids`, saving each plot.
for id_ in ids:
    print(id_)
    for kind in ['blotch']:
        print(kind)
        # A new scanner is created for every image id / kind combination.
        dbscanner = DBScanner(savedir='do_cluster_on_large', do_large_run=True)
        # Alternative kept for reference (parameter scan instead of a single run):
#         dbscanner.parameter_scan(kind, [0.1, 0.13], [30, 50, 70])
        # for blotch:
        dbscanner.cluster_and_plot(id_, kind, saveplot=True)
        # Close all figures so they do not pile up across iterations.
        plt.close('all')

In [ ]:
# Blotch parameter scan over `ithaca_sample`.
# NOTE(review): `ithaca_sample` is not defined anywhere in the visible notebook, and
# `DBScanner(id_)` / `parameter_scan(kind, ...)` use a different call signature than
# the other cells — presumably written against an older DBScanner API; confirm.
for id_ in ithaca_sample:
    print(id_)
    for kind in ['blotch']:
        print(kind)
        dbscanner = DBScanner(id_)
        # fan variant of the scan, kept for reference:
#         dbscanner.parameter_scan(kind, [0.1, 0.13], [30, 50, 70])
        # for blotch:
        dbscanner.parameter_scan(kind, [0.1, 0.13], [15, 22, 30])
        plt.close('all')

In [ ]:
# Fan parameter scan over `ithaca_sample`.
# NOTE(review): `ithaca_sample` is not defined anywhere in the visible notebook, and
# `DBScanner(id_)` / `parameter_scan(kind, ...)` use a different call signature than
# the other cells — presumably written against an older DBScanner API; confirm.
for id_ in ithaca_sample:
    print(id_)
    for kind in ['fan']:
        print(kind)
        dbscanner = DBScanner(id_)
        dbscanner.parameter_scan(kind, [0.1, 0.13], [30, 50, 70])
        # blotch variant of the scan, kept for reference:
#         dbscanner.parameter_scan(kind, [0.1, 0.13], [15, 22, 30])
        plt.close('all')

In [ ]:
from shapely.geometry import Point

p1 = Point(266.4, 470.56)
p2 = Point(262.072, 469.679)

p1.distance(p2)

single item checking


In [ ]:
%matplotlib ipympl

In [ ]:
from planet4.catalog_production import ReleaseManager

In [ ]:
rm = ReleaseManager('v1.0')
rm.savefolder

In [ ]:
db = DBScanner(savedir='examples_for_paper', do_large_run=True)

In [ ]:
db.eps_values

In [ ]:
db.cluster_and_plot('arp', 'fan')

In [ ]:
plotting.plot_image_id_pipeline('gr0', datapath='gold_per_obsid', via_obsid=True)

In [ ]:
plt.close('all')

In [ ]:
id_ = ids[14]

In [ ]:
db.parameter_scan(id_, 'fan', msf_vals_to_scan=(0.1, 0.13),
                  eps_vals_to_scan=(10, 20, 30), size_to_scan='small')

In [ ]:
plotting.plot_image_id_pipeline(id_, datapath=rm.savefolder, save=True, saveroot='./plots')

In [ ]:
data = io.DBManager().get_image_id_markings('arp')

In [ ]:
data.classification_id.nunique()

In [ ]:
data.groupby(['classification_id', 'user_name']).marking.value_counts()

In [ ]:
data[data.marking=='blotch'].shape

In [ ]:
db.parameter_scan('bsn', 'blotch', [0.10, 0.13], [10, 12, 14], size_to_scan='small', )

In [ ]:
v1 = (8.9, 87.3)
v2 = (19.8, 79.8)

In [ ]:
v1 = np.array(v1)
v2 = np.array(v2)

In [ ]:
from numpy.linalg import norm

In [ ]:
norm(v1 - v2)

In [ ]:
# The second positional argument of `norm` is `ord`, not a second vector;
# the Euclidean distance between v1 and v2 is the norm of their difference.
norm(np.array(v1) - np.array(v2))

In [ ]:
db.save_results

In [ ]:
db.final_clusters['blotch']

In [ ]:
import seaborn as sns
sns.set_context('notebook')

In [ ]:
import itertools

# Plot every entry of `db.final_clusters['blotch'][1]` onto one axes, each in its own
# color, and save the figure.
# Cycling the palette lets the colors repeat if there are more entries than colors.
palette = itertools.cycle(sns.color_palette('bright'))
fig, ax = plt.subplots()

for b in db.final_clusters['blotch'][1]:
    db.p4id.plot_blotches(data=b, user_color=next(palette), ax=ax)
    # The title is (re)set on every iteration, so it only appears if the loop runs.
    ax.set_title('second round')
fig.savefig('second_round.png', dpi=150)

In [ ]:
db.parameter_scan('1wg', 'fan', 
                  msf_vals_to_scan=[0.1, 0.13],
                  eps_vals_to_scan=[20, 25, 30],
                  size_to_scan='large')

In [ ]:
db.parameter_scan('15k', 'blotch', 
                  msf_vals_to_scan=[0.1, 0.13],
                  eps_vals_to_scan=[10, 12, 15],
                  size_to_scan='small')

In [ ]:
fig, ax = plt.subplots()
db.p4id.plot_blotches(ax=ax)
ax.set_title('input data')
fig.savefig('input_data.png', dpi=150)

In [ ]:
blotches = db.p4id.filter_data('blotch').dropna(how='all', axis=1)

In [ ]:
blotches['x y radius_1 radius_2 angle'.split()].sort_values(by='radius_1')

In [ ]:
fans = db.p4id.filter_data('fan')

In [ ]:
xyclusters = pd.concat(db.cluster_xy(blotches, 15)).dropna(how='all', axis=1)

In [ ]:
blotches.shape

In [ ]:
xyclusters.shape

In [ ]:
blotches[~blotches.isin(xyclusters).all(1)].shape

In [ ]:


In [ ]:


In [ ]:


In [ ]:
db.eps_values['blotch']['angle']= None

In [ ]:
db.eps_values['blotch']['angle']= 20

In [ ]:
db.eps_values['blotch']['radius']['small']=30

In [ ]:
db.eps_values

In [ ]:
db.parameter_scan('bp7', 'blotch', [0.1, 0.13], [15,22,30], 'small')

In [ ]:
db.cluster_image_id('bz7')

In [ ]:
db.cluster_and_plot('bz7', 'blotch')

In [ ]:
db.min_samples

In [ ]:
db.cluster_image_id('bb6')

In [ ]:
db.final_clusters['blotch'][0][4][markings.Blotch.to_average+['user_name']]

In [ ]:
db.final_clusters['blotch'][0][2][markings.Blotch.to_average+['user_name']]

In [ ]:


In [ ]:
%debug

In [ ]:
db.parameter_scan('blotch', [0.1, 0.13], [15, 22, 30])

In [ ]:
db.parameter_scan('fan', [0.1,0.15], [30, 50,70])

In [ ]:
db.pipeline(10, 3, 50)

In [ ]:
db.store_folder

In [ ]:
# Area of every row in `blotches`, each wrapped in a `markings.Blotch` to get `.area`.
sizes = [
    markings.Blotch(row, scope='planet4').area
    for _, row in blotches.iterrows()
]

In [ ]:
%matplotlib nbagg

In [ ]:
plt.figure()
plt.hist(sizes, bins=50);

In [ ]:
db.parameter_scan('fan', [0.1,0.15], [10, 15, 20])

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:
db.cluster_and_plot('blotch', 20, 3)
ax = plt.gca()
ax.get_title()

In [ ]:


In [ ]:
db.parameter_scan('fan', [0.07, 0.1, 0.15], [15,20])

In [ ]:
db.parameter_scan('blotch', [0.07, 0.1, 0.15], [15,20])

In [ ]:
ek1.cluster_and_plot('blotch', 20, 3)

In [ ]:
ek1.p4id.plot_blotches(data=ek1.finalclusters[5])

In [ ]:
ek1.p4id.plot_blotches(data=ek1.averaged[5])

In [ ]:


In [ ]:


In [ ]:
p4id = markings.ImageID('1fe', scope='planet4')
blotches = p4id.get_blotches()

In [ ]:
X = blotches['x y'.split()]

In [ ]:
dbscanner = DBScanner(X, min_samples=5, eps=20)

In [ ]:
clusters = [blotches.loc[idx] for idx in dbscanner.clustered_indices]

In [ ]:
from planet4.clustering import cluster_angles

In [ ]:
# For every xy-cluster, split it further with `cluster_angles` and store the
# resulting sub-frames; `bucket` ends up as a list (per cluster) of lists (per
# angle sub-cluster).
bucket = []
for cluster in clusters:
    print(cluster.shape)
    # `cluster_angles` yields index collections that are looked up in `cluster` via .loc
    bucket.append([cluster.loc[idx] for idx in cluster_angles(cluster, 'blotch', 5)])

In [ ]:
for item in bucket:
    for subitem in item:
        print(subitem.shape)

In [ ]:
cluster_and_plot('1dr', production=True, dynamic=True,
                         msf=msf, eps=eps, radii=False, dbscan=True,
                         figtitle=figtitle)

In [ ]:


In [ ]:
cm = cluster_and_plot('1dt', production=False, msf=0.1, dynamic=True,
                      radii=False, dbscan=False)

In [ ]:
df = pd.read_csv('fuckdf.csv')

In [ ]:
(df - df.mean(axis=0))/df.std(axis=0)

In [ ]:
df[df.apply(lambda x: np.abs(x - x.mean()) / x.std() < 1).all(axis=0)]

In [ ]:
from scipy.stats import zscore

In [ ]:
zscore??

In [ ]:
pd.DataFrame(zscore(df,ddof=1))

In [ ]:
def highlight_bigger_std(x, n_std=2):
    """Mark the outliers of a Series with a yellow background.

    Written for ``DataFrame.style.apply``, which calls it once per column.

    Parameters
    ----------
    x : pd.Series
        Values to check.
    n_std : float, optional
        Number of standard deviations a value must deviate from the Series
        mean to be highlighted. Default: 2

    Returns
    -------
    list of str
        One CSS string per value: 'background-color: yellow' when the value
        lies more than `n_std` standard deviations from the mean, otherwise ''.
    """
    # A zero standard deviation produces NaN, which compares False,
    # so nothing is highlighted in a constant Series.
    is_outlier = np.abs(x - x.mean()) / x.std() > n_std
    return ['background-color: yellow' if flag else '' for flag in is_outlier]

In [ ]:
df.style.apply(highlight_bigger_std)

In [ ]:


In [ ]:


In [ ]:
cm = cluster_and_plot('pbr', production=False, msf=0.1, dynamic=True,
                      radii=False)

In [ ]:
cm = cluster_and_plot('pbr',eps=20, production=False, msf=0.1, dynamic=True,
                      radii=True)

In [ ]:
cm.db

In [ ]:
imgid = '1at'
imgid = 'dch'
imgid = 'bvc'
imgid = '1dr'
imgid = '1fe'
imgid = 'br5'
imgid = 'ek1'
p4id = markings.ImageID(imgid, scope='planet4')

In [ ]:
data = p4id.get_blotches()

In [ ]:
from planet4.dbscan import DBScanner

In [ ]:
current_X = data[['x','y']].values

In [ ]:
clusterer = DBScanner(current_X, eps=15, min_samples=3)

In [ ]:
clusterer.n_clusters_

In [ ]:
cluster = data.loc[clusterer.clustered_indices[0]]

p4id.plot_blotches(blotches=cluster,with_center=True)

In [ ]:
cluster[blotchcols]

In [ ]:
indices = clustering.cluster_angles(cluster, 'blotch', eps_blotchangle=10)
indices

In [ ]:
angle_cluster_data = cluster.loc[indices[0], blotchcols +['user_name']]

In [ ]:
angle_cluster_data

In [ ]:
df = angle_cluster_data[blotchcols]

In [ ]:
df[df.apply(lambda x: np.abs(x - x.mean()) / x.std() < 1).all(axis=1)]

In [ ]:
clustering.get_average_object(angle_cluster_data[blotchcols], 'blotch')

In [ ]:
p4id.plot_blotches(blotches=cluster.loc[indices[0]], with_center=True)

In [ ]:
df = cluster.loc[indices[0]][blotchcols]

In [ ]:
df['area'] = df.apply(lambda x: np.pi*x.radius_1*x.radius_2, axis=1)

In [ ]:
df

In [ ]:
col='radius_1'

In [ ]:
df.radius_1.std()

In [ ]:
df[np.abs(df[col]-df[col].mean())<=(1*df[col].std())]

In [ ]:
df[df.apply(lambda x: np.abs(x - x.mean()) / x.std() < 1).all(axis=1)]

In [ ]:
subclus

In [ ]:
testblotch = markings.Blotch?

In [ ]:
testblotchdata = dict(x=340, y=340, angle=127, radius_1=250, radius_2=186)

In [ ]:
testblotch = markings.Blotch(
    pd.DataFrame(
        testblotchdata, index=[0]), scope='planet4')
fig, ax = plt.subplots()
ax.add_artist(testblotch)
ax.set_xlim(0, 800)
ax.set_ylim(0, 600)

In [ ]:
testblotch = markings.Blotch(
    pd.DataFrame(testblotchdata, index=[0]),
    scope='planet4')

p4id.plot_blotches(blotches=[testblotch])

In [ ]:
from sklearn.cluster import DBSCAN


class DBScanner(object):
    """Run DBSCAN on an array and record the members of each cluster.

    Instantiating the object immediately calls `_run_DBSCAN()`, which performs
    the clustering and draws the clustered samples onto the current
    matplotlib axes.

    Note that this notebook-local class shadows the `DBScanner` imported from
    `planet4.dbscan` for all cells executed after it.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Data to be clustered. Only the first one or two feature columns are
        plotted.
    eps : float, optional
        Distance criterion for the DBSCAN algorithm. Samples further away than
        this value don't become members of the currently considered cluster.
        Default: 15
    min_samples : int, optional
        Minimum number of samples required for a cluster to be created.
        Default: 3
    only_core : bool, optional
        If True, only core samples are recorded as cluster members.
        Default: False

    Attributes
    ----------
    db : sklearn.cluster.DBSCAN
        The fitted estimator.
    n_clusters_ : int
        Number of clusters found, not counting the noise label -1.
    clustered_indices : list of numpy.ndarray
        One boolean mask over the samples per cluster, in ascending label order.
    n_rejected : int
        Number of samples labelled as noise (-1).
    """

    def __init__(self, X, eps=15, min_samples=3, only_core=False):
        self.X = X
        self.eps = eps
        self.min_samples = min_samples
        self.only_core = only_core

        # this line executes the clustering
        self._run_DBSCAN()

    @staticmethod
    def _plot_members(xy, color, markersize):
        """Plot the samples in `xy` as circles; 1-column data is drawn at y=0."""
        if xy.shape[1] > 1:
            y = xy[:, 1]
        else:
            y = [0] * xy.shape[0]
        plt.plot(
            xy[:, 0],
            y,
            'o',
            markerfacecolor=color,
            markeredgecolor='black',
            markersize=markersize)

    def _run_DBSCAN(self):
        """Perform the DBSCAN clustering and plot the result."""
        # Boolean-mask and column indexing below require an ndarray.
        X = np.asarray(self.X)
        # `min_samples` is keyword-only in current scikit-learn releases.
        db = DBSCAN(eps=self.eps, min_samples=self.min_samples).fit(X)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True

        labels = db.labels_
        # Sorted so that clustered_indices[i] always belongs to the i-th label.
        unique_labels = sorted(set(labels))
        colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

        self.n_clusters_ = len(unique_labels) - (1 if -1 in labels else 0)

        self.clustered_indices = []  # one boolean member mask per cluster
        self.n_rejected = 0
        for k, col in zip(unique_labels, colors):
            # mask of all samples carrying this label
            class_member_mask = (labels == k)

            if k == -1:
                # Noise: count the flagged samples (not the mask length, which
                # is the total number of samples) and do not plot them.
                self.n_rejected = int(class_member_mask.sum())
                continue

            if self.only_core:
                cluster_members = (class_member_mask & core_samples_mask)
            else:
                cluster_members = class_member_mask

            # large markers: recorded members; small markers: non-core members
            self._plot_members(X[cluster_members], col, markersize=14)
            self._plot_members(
                X[class_member_mask & ~core_samples_mask], col, markersize=6)
            self.clustered_indices.append(cluster_members)
        plt.gca().invert_yaxis()
        plt.title('Estimated number of clusters: %d' % self.n_clusters_)
        self.db = db

In [ ]:
cluster[blotchcols]

In [ ]:
xy_angles = clustering.angle_to_xy(cluster.angle, 'blotch')

In [ ]:
xy_angles

In [ ]:
xy_angles.shape

In [ ]:
plt.figure(figsize=(5*1.3,5))
clusterer = DBScanner(xy_angles, eps=20*np.pi/360, min_samples=3)

In [ ]:
data.loc[clusterer.clustered_indices[1]]

In [ ]:
# For every cluster mask found by `clusterer`, print the cluster size, split the
# cluster by angle, and print how many rows remain per angle sub-cluster once
# only the first row of each user is kept.
for cluster_members in clusterer.clustered_indices:
    clusterdata = data.loc[cluster_members, blotchcols + ['user_name']]
    print(len(clusterdata))
    angle_clustered = clustering.cluster_angles(clusterdata, 'blotch')
    for indices in angle_clustered:
        angle_clusterdata = clusterdata.loc[indices, blotchcols +
                                            ['user_name']]
        # groupby(...).first() keeps one row per user_name
        filtered = angle_clusterdata.groupby('user_name').first()
        print(len(filtered))

In [ ]:
cm.min_samples

In [ ]:
30* cm.min_samples_factor

In [ ]:
cm.reduced_data['blotch']

In [ ]:
cm.cluster_angles

In [ ]:
db = clustering.cluster_angles(cluster, 'blotch')
len(db[0])

In [ ]:
len(cluster)

In [ ]:


In [ ]:


In [ ]:


In [ ]:
filtered = cluster.groupby('user_name').first()

In [ ]:
plt.figure()
filtered.angle.hist()

In [ ]:


In [ ]:
toprint = cluster2[markings.Fan.to_average + ['user_name', 'marking', 'classification_id']]

In [ ]:
toprint.to_clipboard(index=False)

In [ ]:
def add_angle_vector(df):
    """Return a copy of `df` extended by the unit-vector components of `angle`.

    Parameters
    ----------
    df : pd.DataFrame
        Needs an `angle` column in degrees.

    Returns
    -------
    pd.DataFrame
        Copy of `df` with the columns `xang` (cosine) and `yang` (sine) set;
        the input frame is left untouched.
    """
    extended = df.copy()
    for column, trig in (('xang', np.cos), ('yang', np.sin)):
        extended[column] = trig(np.deg2rad(df.angle))
    return extended

In [ ]:
cluster2 = add_angle_vector(cluster2)

In [ ]:
cluster2

testing angle deltas


In [ ]:
def angle_to_xy(angle):
    """Convert angles in degrees to points on the unit circle.

    Parameters
    ----------
    angle : array-like
        Angles in degrees.

    Returns
    -------
    numpy.ndarray
        For 1-d input of length n, an (n, 2) array with the cosine in the
        first and the sine in the second column.
    """
    components = [trig(np.deg2rad(angle)) for trig in (np.cos, np.sin)]
    return np.vstack(components).T

In [ ]:
def cluster_angles(angles, delta_angle):
    """Cluster angles with DBSCAN after mapping them onto the unit circle.

    Parameters
    ----------
    angles : array-like
        Angles in degrees.
    delta_angle : float
        Angular tolerance in degrees; scaled to a distance and used as `eps`.

    Returns
    -------
    DBScanner
        The scanner object holding the clustering result.
    """
    # Numerically this matches the chord length between two unit-circle points
    # 1 degree apart (2 * sin(0.5 deg)) — presumably its origin; confirm.
    dist_per_degree = 0.017453070996747883
    return DBScanner(
        angle_to_xy(angles),
        eps=delta_angle * dist_per_degree,
        min_samples=3,
    )

In [ ]:
clusterer = cluster_angles(cluster.angle, 10)

In [ ]:
clusterer.db.core_sample_indices_

In [ ]:
clusterer.db.labels_

In [ ]:
cluster.shape

In [ ]:
clusterer.clustered_indices

In [ ]:
cluster2.iloc[clusterer.clustered_data[0]]

In [ ]:


In [ ]:
dbscanner.reduced_data[0]

This means all ellipses were clustered together. eps=10 picks 3 out of these 6.


In [ ]:
clusterdata = data.iloc[dbscanner.reduced_data[0]]

So clusterdata is just the same as the input data; I just repeat the exact same code steps here for consistency.


In [ ]:
clusterdata[blotchcols]

In [ ]:


In [ ]:
meandata = clusterdata.mean()
meandata

In [ ]:
from scipy.stats import circmean

In [ ]:
meandata.angle = circmean(clusterdata.angle, high=180)

In [ ]:
meandata

In [ ]:
n_class_old = data.classification_id.nunique()
n_class_old

In [ ]:
# number of classifications that contain at least one fan or blotch marking
# (the two masks are OR-ed, so either marking type qualifies)
f1 = data.marking == 'fan'
f2 = data.marking == 'blotch'
n_class_fb = data[f1 | f2].classification_id.nunique()
n_class_fb

In [ ]:
data=data[data.marking=='blotch']

In [ ]:
plotting.plot_raw_blotches('bvc')

In [ ]:
fans.plot(kind='scatter', x='x',y='y')
plt.gca().invert_yaxis()

In [ ]:
fx1 = data.x < 400 
fx2 = data.x > 300
fy1 = data.y_R > 300
fy2 = data.y_R < 400

In [ ]:
data = data.reset_index()

In [ ]:
data[fx1 & fx2 & fy1 & fy2].angle

In [ ]:
cm.dbscanner.reduced_data

testing cluster_image_name


In [ ]:
dbscanner = dbscan.DBScanner()

In [ ]:
db = io.DBManager()

In [ ]:
data = db.get_obsid_markings('ESP_020568_0950')

In [ ]:
image_ids = data.image_id.unique()

In [ ]:
%matplotlib nbagg
import seaborn as sns
sns.set_context('notebook')

In [ ]:
p4id = markings.ImageID(image_ids[0])
p4id.plot_fans()

In [ ]:
p4id.plot_fans(data=p4id.data.query('angle>180'))

In [ ]:
p4id.imgid

In [ ]:
data[data.marking=='fan'].angle.describe()

In [ ]:
dbscanner.cluster_image_name('PSP_002622_0945')

In [ ]:
db = io.DBManager()

In [ ]:
db.get_image_name_markings('PSP_002622_0945')

Cluster random samples of obsids


In [ ]:
obsids = 'ESP_020476_0950, ESP_011931_0945, ESP_012643_0945, ESP_020783_0950'.split(', ')

In [ ]:
obsids

In [ ]:
def process_obsid(obsid):
    """Cluster one obsid and return it.

    The results are stored under a `savedir` named after the obsid itself.

    Parameters
    ----------
    obsid : str
        Observation id handed to `do_cluster_obsids`.

    Returns
    -------
    str
        The obsid that was passed in.
    """
    # Local import — presumably so the function is self-contained for
    # `execute_in_parallel` workers (confirm).
    from planet4 import catalog_production

    catalog_production.do_cluster_obsids(obsid, savedir=obsid)
    return obsid

In [ ]:
from nbtools import execute_in_parallel

In [ ]:
execute_in_parallel(process_obsid, obsids)

In [ ]:
# For every obsid, plot the pipeline results of up to 50 randomly drawn image ids.
db = io.DBManager()

for obsid in obsids:
    data = db.get_image_name_markings(obsid)
    unique_ids = data.image_id.drop_duplicates()
    # `sample` raises a ValueError when asked for more items than are available,
    # so cap the sample size at the number of distinct image ids.
    image_ids = unique_ids.sample(n=min(50, len(unique_ids)))
    for id_ in image_ids:
        print(id_)
        plotting.plot_image_id_pipeline(id_, datapath=obsid, save=True,
                                        saveroot=f'plots/{obsid}',
                                        via_obsid=True)
        # Close the figures of this image id before moving on to the next one.
        plt.close('all')

In [ ]:
plotting.plot_finals('prv', datapath=obsids[0], via_obsid=True)

In [ ]: